#####
# Crossing borders: SHP data merging
#####

#####
# Load the SHP data
shp <- haven::read_dta(here("data", "cleaning", "SHP", "shp_panel_1999_2017.dta")); setDT(shp)

# there are three bfs numbers with multiple destinations in 2019:
# 4555
# 4915
# 4511
# shp[ bfsnr %in% c(4555, 4915, 4511), table(year)]

####----
# Clean up the bfs numbers for a clean merge

# forall but these three, just use the unique destination
shp[ unique(cw[!bfs %in% c(4555, 4915, 4511), .(bfs, bfs19)]), on = c(bfsnr="bfs"), bfs19 := bfs19]

# for the other three, do it manually to make sure that it is correct
shp[bfsnr %in% c(4555, 4915, 4511), bfs19 := fcase(
  bfsnr == 4555, 4551,
  bfsnr == 4915, 4506,
  bfsnr == 4511, 4511
)]

# fix observations with missing bfs in the raw data.

# Drop one ID with absolutely no info on bfsnr
shp <- shp[ idpers != "6673502"]

# Use most recent observed value for the other cases
setkey(shp, idhouse, idpers, year, shpwave)

NAreplace <- function(id, year){ # this function assumes both columns are sorted by year!
  missing_cell <- is.na(id)
  missing_year <- which(year == year[is.na(id)])
  
  id[missing_cell] <- id[missing_year - 1]
  return(id) 
}

shp[
  idpers %in% shp[is.na(bfs19), idpers],
  bfs19 := NAreplace(bfs19, year),
  by = idpers
]

rm(NAreplace)

# end BFS clean up
####-----

# drop respondents with missing age or age < 18
shp <- shp[age >= 18]

vars <- c("satis_fin", "cmj_risk_unempl", "housingexp", "opin_army", "trust_gov", "satis_demo", "opin_eu", "opin_foreign", "opin_taxation",
          "polit_interest", "nation", "pr01", "pr04", "pe16", "idpers", "weight_i_pop", "idhouse", "weight_h_pop", "hhinc_net", "age", 
          "gender", "educat", "wstat", "civsta", "bfs19", "year")

shp <- shp[, ..vars]

#####
# Merge with cb
#Filter to cb columns we need

cols_with_border15 <- grep("^border15_", names(cb), value = TRUE)
cols_to_exclude <- c("border15_1987", "border15_1988", "border15_1989", "border15_1990", 
                     "border15_1991", "border15_1992", "border15_1993", "border15_1994", 
                     "border15_1995", "border15_1996", "border15_1997", "border15_1998",
                     "border15_2017", "border15_2018", "border15_2019")

# Filter out the columns to exclude
cols_with_border15 <- setdiff(cols_with_border15, cols_to_exclude)

# Combine the explicitly named columns with those that start with "border15_"
selected_cols <- c("bfs19", "BR", "year", "travelMUNmin", "transBorder15", "freeBorder15", "canton_name", "languagereg2019", cols_with_border15)

cb <- unique(cb[, ..selected_cols])
names(cb)

shp <- merge(shp, cb, by = c("bfs19", "year"), all.x = T, all.y = F)

# remove the autor columns for now

#####
# Save the shp data in a temporary location

qsave(shp, here("data", "shp.qs"))
